pacman::p_load(tidyverse, haven, ggrepel, patchwork,
ggthemes, hrbrthemes)Take-home Exercise 1
Project Brief
OECD education director Andreas Schleicher shared in a BBC article that “Singapore managed to achieve excellence without wide differences between children from wealthy and disadvantaged families.” (2016) Furthermore, several Singapore’s Minister for Education also started an “every school a good school” slogan.
The general public, however, strongly belief that there are still disparities that exist, especially between the elite schools and neighborhood school, between students from families with higher socioeconomic status and those with relatively lower socioeconomic status and immigration and non-immigration families.
Project Objectives
The 2022 Programme for International Student Assessment (PISA) data was released on December 5, 2022. PISA global education survey every three years to assess the education systems worldwide through testing 15 year old students in the subjects of mathematics, reading, and science.
In this take-home exercise, we are required to use appropriate Exploratory Data Analysis (EDA) methods and ggplot2 functions to reveal:
the distribution of Singapore students’ performance in mathematics, reading, and science, and
the relationship between these performances with schools, gender and socioeconomic status of the students.
Limit your submission to not more than five EDA visualisation.
1. Data Preparation
1.1 Installing R packages
pacman::p_load() function from the pacman package is used in the following code chunk to install and call the libraries of multiple R packages:
1.2 Importing Dataset
“Student questionnaire data file” from the PISA 2022 database is provided for the task.
The code chunk below uses read_sas() of haven to import PISA data into R environment.
stu_qqq <- read_sas("data/STU_QQQ_SAS/cy08msp_stu_qqq.sas7bdat")stu_qqq_SG <- stu_qqq %>%
filter(CNT == "SGP")write_rds(stu_qqq_SG,
"data/STU_QQQ_SAS/stu_qqq_SG.rds")We upload the file as stu_qqq_Sg.
stu_qqq_Sg <-
read_rds("data/STU_QQQ_SAS/stu_qqq_SG.rds")1.3 Summary Statistics
Display first 5 rows using head()
head(stu_qqq_Sg, 5)# A tibble: 5 × 1,279
CNT CNTRYID CNTSCHID CNTSTUID CYC NatCen STRATUM SUBNATIO REGION OECD
<chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl>
1 SGP 702 70200052 70200001 08MS 070200 SGP01 7020000 70200 0
2 SGP 702 70200134 70200002 08MS 070200 SGP01 7020000 70200 0
3 SGP 702 70200112 70200003 08MS 070200 SGP01 7020000 70200 0
4 SGP 702 70200004 70200004 08MS 070200 SGP01 7020000 70200 0
5 SGP 702 70200152 70200005 08MS 070200 SGP01 7020000 70200 0
# ℹ 1,269 more variables: ADMINMODE <dbl>, LANGTEST_QQQ <dbl>,
# LANGTEST_COG <dbl>, LANGTEST_PAQ <dbl>, Option_CT <dbl>, Option_FL <dbl>,
# Option_ICTQ <dbl>, Option_WBQ <dbl>, Option_PQ <dbl>, Option_TQ <dbl>,
# Option_UH <dbl>, BOOKID <dbl>, ST001D01T <dbl>, ST003D02T <dbl>,
# ST003D03T <dbl>, ST004D01T <dbl>, ST250Q01JA <dbl>, ST250Q02JA <dbl>,
# ST250Q03JA <dbl>, ST250Q04JA <dbl>, ST250Q05JA <dbl>, ST250D06JA <chr>,
# ST250D07JA <chr>, ST251Q01JA <dbl>, ST251Q02JA <dbl>, ST251Q03JA <dbl>, …
Check the structure of stu_qqq_Sg
str(stu_qqq_Sg)tibble [6,606 × 1,279] (S3: tbl_df/tbl/data.frame)
$ CNT : chr [1:6606] "SGP" "SGP" "SGP" "SGP" ...
..- attr(*, "label")= chr "Country code 3-character"
$ CNTRYID : num [1:6606] 702 702 702 702 702 702 702 702 702 702 ...
..- attr(*, "label")= chr "Country Identifier"
$ CNTSCHID : num [1:6606] 70200052 70200134 70200112 70200004 70200152 ...
..- attr(*, "label")= chr "Intl. School ID"
$ CNTSTUID : num [1:6606] 70200001 70200002 70200003 70200004 70200005 ...
..- attr(*, "label")= chr "Intl. Student ID"
$ CYC : chr [1:6606] "08MS" "08MS" "08MS" "08MS" ...
..- attr(*, "label")= chr "PISA Assessment Cycle (2 digits + 2 character Assessment type - MS/FT)"
$ NatCen : chr [1:6606] "070200" "070200" "070200" "070200" ...
..- attr(*, "label")= chr "National Centre 6-digit Code"
$ STRATUM : chr [1:6606] "SGP01" "SGP01" "SGP01" "SGP01" ...
..- attr(*, "label")= chr "Stratum ID 5-character (cnt + original stratum ID)"
$ SUBNATIO : chr [1:6606] "7020000" "7020000" "7020000" "7020000" ...
..- attr(*, "label")= chr "Adjudicated sub-region code 7-digit code (3-digit country code + region ID + stratum ID)"
$ REGION : num [1:6606] 70200 70200 70200 70200 70200 70200 70200 70200 70200 70200 ...
..- attr(*, "label")= chr "REGION"
$ OECD : num [1:6606] 0 0 0 0 0 0 0 0 0 0 ...
..- attr(*, "label")= chr "OECD country"
$ ADMINMODE : num [1:6606] 2 2 2 2 2 2 2 2 2 2 ...
..- attr(*, "label")= chr "Mode of Respondent"
$ LANGTEST_QQQ: num [1:6606] 313 313 313 313 313 313 313 313 313 313 ...
..- attr(*, "label")= chr "Language of Questionnaire"
$ LANGTEST_COG: num [1:6606] 313 313 313 313 313 313 313 313 313 313 ...
..- attr(*, "label")= chr "Language of Assessment"
$ LANGTEST_PAQ: num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Language of Parent Questionnaire"
$ Option_CT : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Creative Thinking Option"
$ Option_FL : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Financial Literacy Option"
$ Option_ICTQ : num [1:6606] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "label")= chr "ICT Questionnaire Option"
$ Option_WBQ : num [1:6606] 0 0 0 0 0 0 0 0 0 0 ...
..- attr(*, "label")= chr "Well-Being Questionnaire Option"
$ Option_PQ : num [1:6606] 0 0 0 0 0 0 0 0 0 0 ...
..- attr(*, "label")= chr "Parent Questionnaire Option"
$ Option_TQ : num [1:6606] 0 0 0 0 0 0 0 0 0 0 ...
..- attr(*, "label")= chr "Teacher Questionnaire Option"
$ Option_UH : num [1:6606] 0 0 0 0 0 0 0 0 0 0 ...
..- attr(*, "label")= chr "Une Heure Option"
$ BOOKID : num [1:6606] 4 45 8 40 42 15 13 39 14 7 ...
..- attr(*, "label")= chr "Form Identifier"
$ ST001D01T : num [1:6606] 10 10 10 10 10 10 10 10 10 10 ...
..- attr(*, "label")= chr "Student International Grade (Derived)"
$ ST003D02T : num [1:6606] 10 6 7 2 9 9 3 4 8 6 ...
..- attr(*, "label")= chr "Student (Standardized) Birth - Month"
$ ST003D03T : num [1:6606] 2006 2006 2006 2006 2006 ...
..- attr(*, "label")= chr "Student (Standardized) Birth -Year"
$ ST004D01T : num [1:6606] 1 2 2 2 1 1 2 2 1 2 ...
..- attr(*, "label")= chr "Student (Standardized) Gender"
$ ST250Q01JA : num [1:6606] 2 1 1 2 2 2 1 1 2 2 ...
..- attr(*, "label")= chr "Which of the following are in your [home]: A room of your own"
$ ST250Q02JA : num [1:6606] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "label")= chr "Which of the following are in your [home]: A computer (laptop, desktop, or tablet) that you can use for school work"
$ ST250Q03JA : num [1:6606] 1 1 2 1 1 1 1 1 1 1 ...
..- attr(*, "label")= chr "Which of the following are in your [home]: Educational Software or Apps"
$ ST250Q04JA : num [1:6606] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "label")= chr "Which of the following are in your [home]: Your own [cell phone] with Internet access (e.g. smartphone)"
$ ST250Q05JA : num [1:6606] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "label")= chr "Which of the following are in your [home]: Internet access (e.g. Wi-fi) (excluding through smartphones)"
$ ST250D06JA : chr [1:6606] "7020002" "7020001" "7020001" "7020002" ...
..- attr(*, "label")= chr "Which of the following are in your home? <Country-specific item 1>"
$ ST250D07JA : chr [1:6606] "7020002" "7020001" "7020002" "7020002" ...
..- attr(*, "label")= chr "Which of the following are in your home? <Country-specific item 2>"
$ ST251Q01JA : num [1:6606] 2 1 2 1 2 2 2 1 3 3 ...
..- attr(*, "label")= chr "How many of these items are there at your [home]: Cars, vans, or trucks"
$ ST251Q02JA : num [1:6606] 1 4 1 2 2 1 1 1 1 1 ...
..- attr(*, "label")= chr "How many of these items are there at your [home]: Mopeds or motorcycles"
$ ST251Q03JA : num [1:6606] 3 3 3 3 2 2 3 3 4 3 ...
..- attr(*, "label")= chr "How many of these items are there at your [home]: Rooms with a bath or shower"
$ ST251Q04JA : num [1:6606] 3 3 3 3 2 3 3 3 4 3 ...
..- attr(*, "label")= chr "How many of these items are there at your [home]: Rooms with a [flush toilet]"
$ ST251Q06JA : num [1:6606] 3 4 2 2 1 2 2 3 4 1 ...
..- attr(*, "label")= chr "How many of these items are there at your [home]: Musical instruments (e.g. guitar, piano, [country-specific example])"
$ ST251Q07JA : num [1:6606] 3 2 1 1 4 1 4 1 4 3 ...
..- attr(*, "label")= chr "How many of these items are there at your [home]: Works of art (e.g. paintings, sculptures, [country-specific example])"
$ ST251D08JA : chr [1:6606] "9999997" "9999997" "9999997" "9999997" ...
..- attr(*, "label")= chr "How many of these items are there at your home? <Country-specific item 1>"
$ ST251D09JA : chr [1:6606] "9999997" "9999997" "9999997" "9999997" ...
..- attr(*, "label")= chr "How many of these items are there at your home? <Country-specific item 2>"
$ ST253Q01JA : num [1:6606] 7 8 7 6 7 7 8 8 8 7 ...
..- attr(*, "label")= chr "How many [digital devices] with screens are there in your [home]?"
$ ST254Q01JA : num [1:6606] 2 3 2 2 2 2 2 2 2 2 ...
..- attr(*, "label")= chr "How many of the following [digital devices] are in your [home]: Televisions"
$ ST254Q02JA : num [1:6606] 1 2 2 1 3 2 2 5 2 2 ...
..- attr(*, "label")= chr "How many of the following [digital devices] are in your [home]: Desktop computers"
$ ST254Q03JA : num [1:6606] 3 2 2 2 2 2 3 3 3 4 ...
..- attr(*, "label")= chr "How many of the following [digital devices] are in your [home]: Laptop computers or notebooks"
$ ST254Q04JA : num [1:6606] 2 3 2 1 1 2 2 3 3 2 ...
..- attr(*, "label")= chr "How many of the following [digital devices] are in your [home]: Tablets (e.g. [iPad®], [BlackBerry® Playbook™])"
$ ST254Q05JA : num [1:6606] 1 5 1 1 NA 1 1 5 2 2 ...
..- attr(*, "label")= chr "How many of the following [digital devices] are in your [home]: E-book readers (e.g. [Kindle™], [Kobo], [Bookeen])"
$ ST254Q06JA : num [1:6606] 3 2 3 3 4 2 4 3 4 4 ...
..- attr(*, "label")= chr "How many of the following [digital devices] are in your [home]: [Cell phones] with Internet access (i.e. smartphones)"
$ ST255Q01JA : num [1:6606] 7 4 4 3 2 2 4 5 7 4 ...
..- attr(*, "label")= chr "How many books are there in your [home]?"
$ ST256Q01JA : num [1:6606] 2 4 5 2 4 1 1 3 4 4 ...
..- attr(*, "label")= chr "How many of these books at [home]: Religious books (e.g. [Bible], [Example 2])"
$ ST256Q02JA : num [1:6606] 2 5 2 1 1 2 1 5 4 2 ...
..- attr(*, "label")= chr "How many of these books at [home]: Classical literature (e.g. [Shakespeare], [Example 2])"
$ ST256Q03JA : num [1:6606] 4 5 2 1 1 2 2 5 5 1 ...
..- attr(*, "label")= chr "How many of these books at [home]: Contemporary literature"
$ ST256Q06JA : num [1:6606] 4 3 3 2 2 5 2 4 4 1 ...
..- attr(*, "label")= chr "How many of these books at [home]: Books on science"
$ ST256Q07JA : num [1:6606] 3 5 5 2 2 5 1 4 3 1 ...
..- attr(*, "label")= chr "How many of these books at [home]: Books on art, music, or design"
$ ST256Q08JA : num [1:6606] 3 3 3 1 1 5 1 5 5 2 ...
..- attr(*, "label")= chr "How many of these books at [home]: [Technical reference books]"
$ ST256Q09JA : num [1:6606] 2 2 4 2 2 2 2 2 2 3 ...
..- attr(*, "label")= chr "How many of these books at [home]: Dictionaries"
$ ST256Q10JA : num [1:6606] 4 4 5 2 4 1 4 4 4 3 ...
..- attr(*, "label")= chr "How many of these books at [home]: Books to help with your school work"
$ ST230Q01JA : num [1:6606] 4 4 2 4 4 3 2 2 3 4 ...
..- attr(*, "label")= chr "How many siblings (including brothers, sisters, step-brothers, and step-sisters) do you have?"
$ ST005Q01JA : num [1:6606] 2 2 2 2 2 2 2 2 2 2 ...
..- attr(*, "label")= chr "What is the [highest level of schooling] completed by your mother?"
$ ST006Q01JA : num [1:6606] 2 2 2 2 2 2 2 NA 2 2 ...
..- attr(*, "label")= chr "Does your mother have any of the following qualifications: [ISCED level 8]"
$ ST006Q02JA : num [1:6606] 2 2 2 2 2 2 2 1 2 2 ...
..- attr(*, "label")= chr "Does your mother have any of the following qualifications: [ISCED level 7]"
$ ST006Q03JA : num [1:6606] 1 2 2 2 2 2 2 1 1 1 ...
..- attr(*, "label")= chr "Does your mother have any of the following qualifications: [ISCED level 6]"
$ ST006Q04JA : num [1:6606] 2 1 2 2 1 1 2 1 1 2 ...
..- attr(*, "label")= chr "Does your mother have any of the following qualifications: [ISCED level 5]"
$ ST006Q05JA : num [1:6606] 1 1 2 1 2 1 1 1 1 1 ...
..- attr(*, "label")= chr "Does your mother have any of the following qualifications: [ISCED level 4]"
$ ST007Q01JA : num [1:6606] 2 2 2 4 2 2 4 2 2 2 ...
..- attr(*, "label")= chr "What is the [highest level of schooling] completed by your father?"
$ ST008Q01JA : num [1:6606] 2 2 2 NA 2 2 2 NA 2 2 ...
..- attr(*, "label")= chr "Does your father have any of the following qualifications: [ISCED level 8]"
$ ST008Q02JA : num [1:6606] 2 2 2 NA 2 1 2 NA 2 2 ...
..- attr(*, "label")= chr "Does your father have any of the following qualifications: [ISCED level 7]"
$ ST008Q03JA : num [1:6606] 2 2 2 NA 2 1 2 1 1 2 ...
..- attr(*, "label")= chr "Does your father have any of the following qualifications: [ISCED level 6]"
$ ST008Q04JA : num [1:6606] 1 1 2 NA 1 1 2 1 1 1 ...
..- attr(*, "label")= chr "Does your father have any of the following qualifications: [ISCED level 5]"
$ ST008Q05JA : num [1:6606] 2 1 2 1 2 1 2 1 1 2 ...
..- attr(*, "label")= chr "Does your father have any of the following qualifications: [ISCED level 4]"
$ ST258Q01JA : num [1:6606] 1 1 1 5 1 1 1 1 1 1 ...
..- attr(*, "label")= chr "In the past 30 days, how often did you not eat because there was not enough money to buy food?"
$ ST259Q01JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Now think about where you would place your family on this scale. Where would you say your family stands at this time?"
$ ST259Q02JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Now think about where you would place your family on this scale. Where do you think you will stand when you are 30?"
$ ST019AQ01T : num [1:6606] 1 1 1 1 1 2 1 2 1 1 ...
..- attr(*, "label")= chr "In what country were you and your parents born? You"
$ ST019BQ01T : num [1:6606] 1 1 2 1 1 2 1 2 1 1 ...
..- attr(*, "label")= chr "In what country were you and your parents born? Mother"
$ ST019CQ01T : num [1:6606] 1 1 1 1 1 2 1 2 1 1 ...
..- attr(*, "label")= chr "In what country were you and your parents born? Father"
$ ST021Q01TA : num [1:6606] NA NA NA NA NA 1 NA 1 NA NA ...
..- attr(*, "label")= chr "How old were you when you arrived in [country of test]?"
$ ST022Q01TA : num [1:6606] 1 1 2 2 1 2 1 2 1 2 ...
..- attr(*, "label")= chr "What language do you speak at home most of the time?"
$ ST226Q01JA : num [1:6606] 1 1 1 1 1 1 1 4 1 1 ...
..- attr(*, "label")= chr "How long have you been enrolled at this school?"
$ ST125Q01NA : num [1:6606] 3 5 8 4 8 3 4 8 8 4 ...
..- attr(*, "label")= chr "How old were you when you started [ISCED 0]: Years"
$ ST126Q01TA : num [1:6606] 4 5 5 5 5 5 5 5 4 5 ...
..- attr(*, "label")= chr "How old were you when you started [ISCED 1]: Years"
$ ST127Q01TA : num [1:6606] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "label")= chr "Have you ever repeated a [grade]: At [ISCED 1]"
$ ST127Q02TA : num [1:6606] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "label")= chr "Have you ever repeated a [grade]: At [ISCED 2]"
$ ST127Q03TA : num [1:6606] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "label")= chr "Have you ever repeated a [grade]: At [ISCED 3]"
$ ST260Q01JA : num [1:6606] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "label")= chr "Have you ever missed school for more than three months in a row: At [ISCED 1]"
$ ST260Q02JA : num [1:6606] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "label")= chr "Have you ever missed school for more than three months in a row: At [ISCED 2]"
$ ST260Q03JA : num [1:6606] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "label")= chr "Have you ever missed school for more than three months in a row: At [ISCED 3]"
$ ST261Q01JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Why miss school for 3+ months: I was bored."
$ ST261Q02JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Why miss school for 3+ months: I was suspended for something (e.g. violence, aggression, use of drugs, drug dealing)."
$ ST261Q03JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Why miss school for 3+ months: I was pregnant."
$ ST261Q04JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Why miss school for 3+ months: I could not reach school because of transportation problems."
$ ST261Q05JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Why miss school for 3+ months: I had to take care of a family member."
$ ST261Q06JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Why miss school for 3+ months: I had to help with work at home, the family business, or on the family land."
$ ST261Q07JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Why miss school for 3+ months: I had to get work to bring money home."
$ ST261Q08JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Why miss school for 3+ months: I was sick."
$ ST261Q09JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Why miss school for 3+ months: I did not feel safe at school."
$ ST261Q10JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Why miss school for 3+ months: I could not pay [school fees]."
$ ST261Q11JA : num [1:6606] NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Why miss school for 3+ months: School was closed because of a natural disaster (e.g. flood, earthquake)."
$ ST062Q01TA : num [1:6606] 1 1 1 1 2 1 1 1 1 1 ...
..- attr(*, "label")= chr "In the last two full weeks of school, how often: I [skipped] a whole school day"
[list output truncated]
The dataset contains the Intl. School ID (CNTSCHID), Intl. Student ID (CNTSTUID), and Student (Standardized) Gender (ST004D01T) variables, which are currently kept as numeric data types. We will convert these to categorical data types due to the following reasons:
The International School ID (CNTSCHID) is a numerical designation used to uniquely identify various schools. The numbers lack inherent mathematical significance; they serve solely as designations.
Comparable to school IDs, The International Student ID (CNTSTUID) is a distinct identity assigned to each student, it should be regarded as a label.
“Student (Standardized) Gender (ST004D01T)” represents gender and is characterized by discrete categories such as male or female rather than a numerical scale.
stu_qqq_Sg$CNTSCHID <- as.factor(stu_qqq_Sg$CNTSCHID)
stu_qqq_Sg$CNTSTUID <- as.factor(stu_qqq_Sg$CNTSTUID)
stu_qqq_Sg$ST004D01T <- as.factor(stu_qqq_Sg$ST004D01T)Proceed to check for duplicates
duplicate_rows <- stu_qqq_Sg[duplicated(stu_qqq_Sg),]
print(head(duplicate_rows))# A tibble: 0 × 1,279
# ℹ 1,279 variables: CNT <chr>, CNTRYID <dbl>, CNTSCHID <fct>, CNTSTUID <fct>,
# CYC <chr>, NatCen <chr>, STRATUM <chr>, SUBNATIO <chr>, REGION <dbl>,
# OECD <dbl>, ADMINMODE <dbl>, LANGTEST_QQQ <dbl>, LANGTEST_COG <dbl>,
# LANGTEST_PAQ <dbl>, Option_CT <dbl>, Option_FL <dbl>, Option_ICTQ <dbl>,
# Option_WBQ <dbl>, Option_PQ <dbl>, Option_TQ <dbl>, Option_UH <dbl>,
# BOOKID <dbl>, ST001D01T <dbl>, ST003D02T <dbl>, ST003D03T <dbl>,
# ST004D01T <fct>, ST250Q01JA <dbl>, ST250Q02JA <dbl>, ST250Q03JA <dbl>, …
The output # A tibble: 0 × 1,279 indicates that there are zero rows in the resulting tibble. This means that no duplicate rows were found in your dataset stu_qqq_Sg across all 1,279 variables. Each row in your dataset is unique when considering all the variables together.
# Count the total number of missing values in the dataset
total_na <- sum(is.na(stu_qqq_Sg))
print(total_na)[1] 4168500
# Count the number of missing values per column
na_per_column <- colSums(is.na(stu_qqq_Sg))
print(na_per_column) CNT CNTRYID CNTSCHID CNTSTUID CYC NatCen
0 0 0 0 0 0
STRATUM SUBNATIO REGION OECD ADMINMODE LANGTEST_QQQ
0 0 0 0 0 39
LANGTEST_COG LANGTEST_PAQ Option_CT Option_FL Option_ICTQ Option_WBQ
0 6606 6606 6606 0 0
Option_PQ Option_TQ Option_UH BOOKID ST001D01T ST003D02T
0 0 0 0 0 0
ST003D03T ST004D01T ST250Q01JA ST250Q02JA ST250Q03JA ST250Q04JA
0 0 66 45 60 46
ST250Q05JA ST250D06JA ST250D07JA ST251Q01JA ST251Q02JA ST251Q03JA
46 0 0 47 78 51
ST251Q04JA ST251Q06JA ST251Q07JA ST251D08JA ST251D09JA ST253Q01JA
62 47 42 0 0 41
ST254Q01JA ST254Q02JA ST254Q03JA ST254Q04JA ST254Q05JA ST254Q06JA
134 188 176 117 120 58
ST255Q01JA ST256Q01JA ST256Q02JA ST256Q03JA ST256Q06JA ST256Q07JA
44 359 347 357 318 339
ST256Q08JA ST256Q09JA ST256Q10JA ST230Q01JA ST005Q01JA ST006Q01JA
377 299 278 43 86 892
ST006Q02JA ST006Q03JA ST006Q04JA ST006Q05JA ST007Q01JA ST008Q01JA
752 545 686 657 103 1034
ST008Q02JA ST008Q03JA ST008Q04JA ST008Q05JA ST258Q01JA ST259Q01JA
794 638 748 736 46 6606
ST259Q02JA ST019AQ01T ST019BQ01T ST019CQ01T ST021Q01TA ST022Q01TA
6606 142 64 76 5266 42
ST226Q01JA ST125Q01NA ST126Q01TA ST127Q01TA ST127Q02TA ST127Q03TA
46 56 52 63 75 95
ST260Q01JA ST260Q02JA ST260Q03JA ST261Q01JA ST261Q02JA ST261Q03JA
59 71 80 6313 6316 6319
ST261Q04JA ST261Q05JA ST261Q06JA ST261Q07JA ST261Q08JA ST261Q09JA
6317 6318 6319 6322 6303 6316
ST261Q10JA ST261Q11JA ST062Q01TA ST062Q02TA ST062Q03TA ST267Q01JA
6319 6320 60 59 49 2459
ST267Q02JA ST267Q03JA ST267Q04JA ST267Q05JA ST267Q06JA ST267Q07JA
2534 2498 2524 2484 2567 2564
ST267Q08JA ST034Q01TA ST034Q02TA ST034Q03TA ST034Q04TA ST034Q05TA
2447 1139 1087 1164 1174 1161
ST034Q06TA ST038Q03NA ST038Q04NA ST038Q05NA ST038Q06NA ST038Q07NA
1147 53 49 60 50 51
ST038Q08NA ST038Q09JA ST038Q10JA ST038Q11JA ST265Q01JA ST265Q02JA
58 49 49 47 43 46
ST265Q03JA ST265Q04JA ST266Q01JA ST266Q02JA ST266Q03JA ST266Q04JA
44 43 52 51 53 52
ST266Q05JA ST294Q01JA ST294Q02JA ST294Q03JA ST294Q04JA ST294Q05JA
53 50 97 73 58 50
ST295Q01JA ST295Q02JA ST295Q03JA ST295Q04JA ST295Q05JA ST326Q01JA
69 79 108 106 75 57
ST326Q02JA ST326Q03JA ST326Q04JA ST326Q05JA ST326Q06JA ST326Q07JA
112 98 78 78 71 6606
ST326Q08JA ST326Q09JA ST326Q10JA ST326Q11JA ST326Q12JA ST322Q01JA
6606 6606 6606 6606 6606 1103
ST322Q02JA ST322Q03JA ST322Q04JA ST322Q06JA ST322Q07JA ST307Q01JA
1143 1156 1182 1212 1111 3320
ST307Q02JA ST307Q03JA ST307Q04JA ST307Q05JA ST307Q06JA ST307Q07JA
3274 3295 3302 3346 3378 3306
ST307Q08JA ST307Q09JA ST307Q10JA ST309Q01JA ST309Q02JA ST309Q03JA
3377 3335 3370 3371 3288 3354
ST309Q04JA ST309Q05JA ST309Q06JA ST309Q07JA ST309Q08JA ST309Q09JA
3252 3349 3354 3387 3254 3372
ST309Q10JA ST301Q01JA ST301Q02JA ST301Q03JA ST301Q04JA ST301Q05JA
3313 3327 3338 3372 3304 3307
ST301Q06JA ST301Q07JA ST301Q08JA ST301Q09JA ST301Q10JA ST343Q01JA
3348 3258 3358 3400 3283 3350
ST343Q02JA ST343Q03JA ST343Q04JA ST343Q05JA ST343Q06JA ST343Q07JA
3295 3297 3395 3346 3393 3373
ST343Q08JA ST343Q09JA ST343Q10JA ST311Q01JA ST311Q02JA ST311Q03JA
3257 3317 3304 3265 3389 3309
ST311Q04JA ST311Q05JA ST311Q06JA ST311Q07JA ST311Q08JA ST311Q09JA
3287 3346 3420 3378 3342 3302
ST311Q10JA ST315Q01JA ST315Q02JA ST315Q03JA ST315Q04JA ST315Q05JA
3274 3332 3320 3328 3355 3280
ST315Q06JA ST315Q07JA ST315Q08JA ST315Q09JA ST315Q10JA ST303Q01JA
3361 3349 3318 3341 3342 2508
ST303Q02JA ST303Q03JA ST303Q04JA ST303Q05JA ST303Q06JA ST303Q07JA
2525 2537 2512 2494 2552 2499
ST303Q08JA ST305Q01JA ST305Q02JA ST305Q03JA ST305Q04JA ST305Q05JA
2493 3283 3301 3312 3406 3370
ST305Q06JA ST305Q07JA ST305Q08JA ST305Q09JA ST305Q10JA ST345Q01JA
3342 3394 3321 3329 3326 3362
ST345Q02JA ST345Q03JA ST345Q04JA ST345Q05JA ST345Q06JA ST345Q07JA
3294 3275 3349 3425 3311 3398
ST345Q08JA ST345Q09JA ST345Q10JA ST313Q01JA ST313Q02JA ST313Q03JA
3311 3309 3305 3271 3288 3328
ST313Q04JA ST313Q05JA ST313Q06JA ST313Q07JA ST313Q08JA ST313Q09JA
3336 3352 3303 3383 3394 3386
ST313Q10JA ST263Q02JA ST263Q04JA ST263Q06JA ST263Q08JA ST016Q01NA
3297 63 61 63 6606 6606
ST059Q01TA ST059Q02JA ST296Q01JA ST296Q02JA ST296Q03JA ST296Q04JA
109 175 70 77 87 78
ST272Q01JA ST273Q01JA ST273Q02JA ST273Q03JA ST273Q04JA ST273Q05JA
6606 1913 1984 1958 1866 1955
ST273Q06JA ST273Q07JA ST270Q01JA ST270Q02JA ST270Q03JA ST270Q04JA
1953 1925 68 67 68 71
ST285Q01JA ST285Q02JA ST285Q03JA ST285Q04JA ST285Q05JA ST285Q06JA
2944 3033 2974 3006 2960 2934
ST285Q07JA ST285Q08JA ST285Q09JA ST283Q01JA ST283Q02JA ST283Q03JA
2975 2974 3018 2984 2983 2987
ST283Q04JA ST283Q05JA ST283Q06JA ST283Q07JA ST283Q08JA ST283Q09JA
2977 2988 2985 2954 3027 2989
ST275Q01WA ST275Q02WA ST275Q03WA ST275Q04WA ST275Q05WA ST275Q06WA
2951 3000 3017 2994 2994 2979
ST275Q07WA ST275Q08WA ST275Q09WA ST276Q01JA ST276Q02JA ST276Q03JA
3033 2925 2994 3321 3360 3385
ST276Q04JA ST276Q05JA ST276Q06JA ST276Q07JA ST276Q08JA ST276Q09JA
3353 3379 3364 3398 3252 3336
ST276Q10JA ST268Q01JA ST268Q02JA ST268Q03JA ST268Q04JA ST268Q05JA
3336 69 74 79 73 71
ST268Q06JA ST268Q07JA ST268Q08JA ST268Q09JA ST290Q01WA ST290Q02WA
82 66 71 65 2963 3023
ST290Q03WA ST290Q04WA ST290Q05WA ST290Q06WA ST290Q07WA ST290Q08WA
3078 2925 2938 2981 2976 2985
ST290Q09WA ST291Q01JA ST291Q02JA ST291Q03JA ST291Q04JA ST291Q05JA
2982 3337 3329 3341 3353 3376
ST291Q06JA ST291Q07JA ST291Q08JA ST291Q09JA ST291Q10JA ST289Q01WA
3348 3324 3320 3343 3399 3859
ST289Q02JA ST289Q03WA ST289Q04JA ST289Q05WA ST289Q06JA ST289Q07JA
3774 3945 3909 3895 3907 3863
ST289Q08WA ST289Q09WA ST289Q10WA ST289Q11WA ST289Q14JA ST293Q01JA
3884 3865 3937 3950 3909 3036
ST293Q02JA ST293Q03JA ST293Q04JA ST293Q05JA ST293Q06JA ST293Q07JA
3028 2983 3001 2921 2933 2998
ST293Q08JA ST293Q09JA ST292Q01JA ST292Q02JA ST292Q03JA ST292Q04JA
2919 3051 1198 1187 1183 1186
ST292Q05JA ST292Q06JA ST297Q01JA ST297Q03JA ST297Q05JA ST297Q06JA
1158 1118 61 61 61 61
ST297Q07JA ST297Q09JA ST334Q01JA ST334Q02JA ST334Q03JA ST334Q04JA
61 61 6606 6606 6606 6606
ST334Q05JA ST334Q06JA ST334Q07JA ST334Q08JA ST334Q09JA ST334Q10JA
6606 6606 6606 6606 6606 6606
ST335Q01JA ST335Q02JA ST335Q03JA ST335Q05JA ST335Q06JA ST335Q07JA
6606 6606 6606 6606 6606 6606
ST336Q01JA ST336Q03JA ST336Q04JA ST336Q05JA ST336Q06JA ST336Q07JA
6606 6606 6606 6606 6606 6606
ST337Q01JA ST337Q02JA ST337Q03JA ST337Q04JA ST337Q05JA ST337Q06JA
6606 6606 6606 6606 6606 6606
ST337Q07JA ST337Q08JA ST338Q01JA ST338Q02JA ST338Q03JA ST338Q04JA
6606 6606 6606 6606 6606 6606
ST338Q05JA ST338Q06JA ST338Q07JA ST338Q08JA ST339Q04JA ST339Q06JA
6606 6606 6606 6606 6606 6606
ST340Q01JA ST340Q02JA ST340Q03JA ST340Q04JA ST340Q05JA ST340Q06JA
6606 6606 6606 6606 6606 6606
ST340Q07JA ST340Q08JA ST340Q09JA ST340Q10JA ST341Q01JA ST341Q02JA
6606 6606 6606 6606 6606 6606
ST341Q03JA ST341Q04JA ST341Q05JA ST342Q01JA ST342Q02JA ST342Q03JA
6606 6606 6606 6606 6606 6606
ST342Q04JA ST342Q06JA ST342Q07JA ST342Q08JA ST300Q01JA ST300Q02JA
6606 6606 6606 6606 3477 3335
ST300Q03JA ST300Q04JA ST300Q05JA ST300Q06JA ST300Q07JA ST300Q08JA
3315 3351 3364 3357 3376 3279
ST300Q09JA ST300Q10JA ST327Q01JA ST327Q02JA ST327Q03JA ST327Q04JA
3410 3327 223 193 6606 265
ST327Q05JA ST327Q06JA ST327Q07JA ST327Q08JA ST330Q01WA ST330Q02WA
410 226 404 593 3648 3669
ST330Q03WA ST330Q04WA ST330Q05WA ST330Q06WA ST330Q07WA ST330Q08WA
3563 3699 3637 3671 3773 3670
ST330Q09WA ST330Q11WA ST330D10WA ST324Q02JA ST324Q04JA ST324Q05JA
3682 3676 0 2987 3029 3034
ST324Q07JA ST324Q10JA ST324Q11JA ST324Q12JA ST324Q13JA ST324Q14JA
2964 3027 3063 3051 2979 2969
ST347Q01JA ST347Q02JA ST348Q01JA ST348Q02JA ST348Q03JA ST348Q04JA
6606 6606 6606 6606 6606 6606
ST348Q05JA ST348Q06JA ST348Q07JA ST348Q08JA ST349Q01JA ST350Q01JA
6606 6606 6606 6606 6606 6606
ST351Q01JA ST351Q02JA ST351Q03JA ST351Q04JA ST351Q05JA ST351Q06JA
6606 6606 6606 6606 6606 6606
ST351Q07JA ST351Q08JA ST352Q01JA ST352Q02JA ST352Q03JA ST352Q04JA
6606 6606 6606 6606 6606 6606
ST352Q05JA ST352Q06JA ST352Q07JA ST352Q08JA ST353Q01JA ST353Q02JA
6606 6606 6606 6606 6606 6606
ST353Q03JA ST353Q04JA ST353Q05JA ST353Q06JA ST353Q07JA ST353Q08JA
6606 6606 6606 6606 6606 6606
ST354Q01JA ST354Q02JA ST354Q03JA ST354Q04JA ST354Q05JA ST354Q06JA
6606 6606 6606 6606 6606 6606
ST354Q07JA ST354Q08JA ST354Q09JA ST354Q10JA ST355Q01JA ST355Q02JA
6606 6606 6606 6606 6606 6606
ST355Q03JA ST355Q04JA ST355Q05JA ST355Q06JA ST355Q07JA ST355Q08JA
6606 6606 6606 6606 6606 6606
ST356Q01JA ST331Q01JA ST331Q02JA ST331Q03JA FL150Q01TA FL150Q02TA
6606 134 126 125 6606 6606
FL150Q03TA FL164Q01HA FL164Q02HA FL164Q03HA FL164Q04HA FL164Q05HA
6606 6606 6606 6606 6606 6606
FL164Q06HA FL164Q07HA FL164Q08HA FL164Q09HA FL164Q10HA FL164Q11HA
6606 6606 6606 6606 6606 6606
FL164Q12HA FL164Q13HA FL164Q14HA FL164Q15HA FL164Q16HA FL166Q01HA
6606 6606 6606 6606 6606 6606
FL166Q02HA FL166Q03HA FL166Q05HA FL166Q06HA FL166Q07HA FL174Q01JA
6606 6606 6606 6606 6606 6606
FL174Q02JA FL174Q03JA FL174Q04JA FL174Q05JA FL174Q06JA FL174Q07JA
6606 6606 6606 6606 6606 6606
FL167Q01HA FL167Q02HA FL167Q06JA FL167Q03HA FL167Q04HA FL167Q05HA
6606 6606 6606 6606 6606 6606
FL167Q07JA FL170Q01JA FL170Q02JA FL170Q03JA FL170Q04JA FL170Q05JA
6606 6606 6606 6606 6606 6606
FL170Q06JA FL170Q07JA FL159Q01HA FL159Q02HA FL159Q03HA FL159Q04HA
6606 6606 6606 6606 6606 6606
FL160Q01HA FL160Q02HA FL160Q03HA FL160Q04HA FL161Q01HA FL161Q02HA
6606 6606 6606 6606 6606 6606
FL161Q03HA FL162Q01HA FL162Q02HA FL162Q03HA FL162Q04HA FL162Q05HA
6606 6606 6606 6606 6606 6606
FL162Q06HA FL163Q01HA FL163Q02HA FL163Q03HA FL163Q04HA FL163Q05HA
6606 6606 6606 6606 6606 6606
FL171Q01JA FL171Q02JA FL171Q03JA FL171Q04JA FL171Q05JA FL171Q07JA
6606 6606 6606 6606 6606 6606
FL171Q08JA FL171Q09JA FL171Q10JA FL171Q11JA FL171Q12JA FL169Q01HA
6606 6606 6606 6606 6606 6606
FL169Q05JA FL169Q02HA FL169Q06JA FL169Q07JA FL169Q03HA FL169Q04HA
6606 6606 6606 6606 6606 6606
FL169Q08JA FL169Q10JA FL169Q11JA FL172Q01JA FL172Q03JA FL172Q05JA
6606 6606 6606 6606 6606 6606
FL172Q06JA IC170Q01JA IC170Q02JA IC170Q03JA IC170Q04JA IC170Q05JA
6606 151 149 137 154 158
IC170Q06JA IC170Q07JA IC171Q01JA IC171Q02JA IC171Q03JA IC171Q04JA
141 132 151 157 151 163
IC171Q05JA IC171Q06JA IC172Q01JA IC172Q02JA IC172Q03JA IC172Q04JA
143 141 141 149 143 156
IC172Q05JA IC172Q06JA IC172Q07JA IC172Q08JA IC172Q09JA IC173Q01JA
151 164 143 147 147 142
IC173Q02JA IC173Q03JA IC173Q04JA IC174Q01JA IC174Q02JA IC174Q03JA
150 155 142 164 176 187
IC174Q04JA IC174Q05JA IC174Q06JA IC174Q07JA IC174Q08JA IC174Q09JA
173 179 179 184 185 176
IC174Q10JA IC175Q01JA IC175Q02JA IC175Q03JA IC175Q05JA IC176Q01JA
169 162 170 166 167 168
IC176Q02JA IC176Q03JA IC176Q04JA IC176Q05JA IC176Q06JA IC176Q07JA
184 199 176 178 173 175
IC176Q08JA IC184Q01JA IC184Q02JA IC184Q03JA IC184Q04JA IC177Q01JA
172 170 177 175 174 173
IC177Q02JA IC177Q03JA IC177Q04JA IC177Q05JA IC177Q06JA IC177Q07JA
199 207 196 183 181 180
IC178Q01JA IC178Q02JA IC178Q03JA IC178Q04JA IC178Q05JA IC178Q06JA
206 231 226 224 207 215
IC178Q07JA IC179Q01JA IC179Q02JA IC179Q03JA IC179Q04JA IC179Q05JA
211 190 202 198 202 195
IC179Q06JA IC180Q01JA IC180Q02JA IC180Q03JA IC180Q04JA IC180Q05JA
192 194 216 205 205 199
IC180Q06JA IC180Q07JA IC180Q08JA IC181Q01JA IC181Q02JA IC181Q03JA
209 200 203 6606 6606 6606
IC181Q04JA IC182Q01JA IC182Q02JA IC182Q03JA IC183Q01JA IC183Q02JA
6606 190 188 190 202 223
IC183Q03JA IC183Q04JA IC183Q05JA IC183Q07JA IC183Q08JA IC183Q09JA
221 233 212 233 211 233
IC183Q10JA IC183Q12JA IC183Q13JA IC183Q14JA IC183Q15JA IC183Q16JA
213 226 226 210 213 207
WB150Q01HA WB151Q01HA WB152Q01HA WB153Q01HA WB153Q02HA WB153Q03HA
6606 6606 6606 6606 6606 6606
WB153Q04HA WB153Q05HA WB154Q01HA WB154Q02HA WB154Q03HA WB154Q04HA
6606 6606 6606 6606 6606 6606
WB154Q05HA WB154Q06HA WB154Q07HA WB154Q08HA WB154Q09HA WB155Q01HA
6606 6606 6606 6606 6606 6606
WB155Q02HA WB155Q03HA WB155Q04HA WB155Q05HA WB155Q06HA WB155Q07HA
6606 6606 6606 6606 6606 6606
WB155Q08HA WB155Q09HA WB155Q10HA WB156Q01HA WB158Q01HA WB160Q01HA
6606 6606 6606 6606 6606 6606
WB161Q01HA WB162Q01HA WB162Q02HA WB162Q03HA WB162Q04HA WB162Q05HA
6606 6606 6606 6606 6606 6606
WB162Q06HA WB162Q07HA WB162Q08HA WB162Q09HA WB163Q01HA WB163Q02HA
6606 6606 6606 6606 6606 6606
WB163Q03HA WB163Q04HA WB163Q05HA WB163Q06HA WB163Q07HA WB163Q08HA
6606 6606 6606 6606 6606 6606
WB164Q01HA WB165Q01HA WB166Q01HA WB166Q02HA WB166Q03HA WB166Q04HA
6606 6606 6606 6606 6606 6606
WB167Q01HA WB168Q01HA WB168Q02HA WB168Q03HA WB168Q04HA WB171Q01HA
6606 6606 6606 6606 6606 6606
WB171Q02HA WB171Q03HA WB171Q04HA WB172Q01HA WB173Q01HA WB173Q02HA
6606 6606 6606 6606 6606 6606
WB173Q03HA WB173Q04HA WB176Q01HA WB177Q01HA WB177Q02HA WB177Q03HA
6606 6606 6606 6606 6606 6606
WB177Q04HA WB032Q01NA WB032Q02NA WB031Q01NA WB178Q01HA WB178Q02HA
6606 6606 6606 6606 6606 6606
WB178Q03HA WB178Q04HA WB178Q05HA WB178Q06HA WB178Q07HA PA001Q01TA
6606 6606 6606 6606 6606 6606
PA001Q02TA PA001Q03TA PA003Q01TA PA003Q02TA PA003Q03TA PA003Q05IA
6606 6606 6606 6606 6606 6606
PA003Q18WA PA003Q19WA PA003Q20WA PA003Q11JA PA003Q12JA PA003Q13JA
6606 6606 6606 6606 6606 6606
PA003Q14JA PA003Q15JA PA003Q16JA PA003Q17JA PA196Q01WA PA196Q02WA
6606 6606 6606 6606 6606 6606
PA196Q03WA PA196Q04WA PA197Q01WA PA197Q02WA PA197Q03WA PA197Q04WA
6606 6606 6606 6606 6606 6606
PA197Q05WA PA008Q01TA PA008Q02TA PA008Q03TA PA008Q04TA PA008Q05TA
6606 6606 6606 6606 6606 6606
PA008Q06NA PA008Q07NA PA008Q08NA PA008Q09NA PA008Q10NA PA009Q01NA
6606 6606 6606 6606 6606 6606
PA009Q02NA PA009Q03NA PA009Q04NA PA009Q05NA PA009Q06NA PA009Q07NA
6606 6606 6606 6606 6606 6606
PA009Q08NA PA009Q09NA PA009Q10NA PA009Q11NA PA007Q01TA PA007Q02TA
6606 6606 6606 6606 6606 6606
PA007Q03TA PA007Q04TA PA007Q05TA PA007Q06TA PA007Q07TA PA007Q09NA
6606 6606 6606 6606 6606 6606
PA007Q11NA PA007Q12NA PA007Q13NA PA007Q14NA PA007Q15NA PA005Q01TA
6606 6606 6606 6606 6606 6606
PA006Q01TA PA006Q02TA PA006Q03TA PA006Q04TA PA006Q05TA PA006Q06TA
6606 6606 6606 6606 6606 6606
PA006Q07TA PA006Q08TA PA006Q09TA PA006Q10TA PA006Q11TA PA006Q12HA
6606 6606 6606 6606 6606 6606
PA006Q13HA PA006Q14HA PA166Q01HA PA167Q02HA PA167Q03HA PA167Q04HA
6606 6606 6606 6606 6606 6606
PA167Q05HA PA183Q01JA PA183Q02JA PA183Q03JA PA183Q04JA PA183Q05JA
6606 6606 6606 6606 6606 6606
PA183Q06JA PA183Q07JA PA183Q08JA PA018Q01NA PA018Q02NA PA018Q03NA
6606 6606 6606 6606 6606 6606
PA177Q01HA PA177Q02HA PA177Q03HA PA177Q04HA PA177Q05HA PA177Q06HA
6606 6606 6606 6606 6606 6606
PA177Q07HA PA177Q08HA PA180Q01HA PA182Q01HA PA175Q01HA PA175Q02HA
6606 6606 6606 6606 6606 6606
PA175Q03JA PA175Q04JA PA185Q01JA PA185Q02JA PA185Q03JA PA185Q04JA
6606 6606 6606 6606 6606 6606
PA185Q05JA PA185Q07JA PA185Q08JA PA185Q09JA PA185Q10JA PA186Q01JA
6606 6606 6606 6606 6606 6606
PA186Q02JA PA186Q03JA PA186Q04JA PA186Q05JA PA186Q06JA PA186Q07JA
6606 6606 6606 6606 6606 6606
PA186Q08JA PA187Q04JA PA187Q06JA PA188Q01JA PA188Q02JA PA188Q03JA
6606 6606 6606 6606 6606 6606
PA188Q04JA PA188Q05JA PA188Q06JA PA188Q08JA PA188Q09JA PA188Q10JA
6606 6606 6606 6606 6606 6606
PA189Q01JA PA189Q02JA PA189Q03JA PA189Q04JA PA189Q05JA PA189Q06JA
6606 6606 6606 6606 6606 6606
PA189Q09JA PA189Q10JA PA194Q01JA PA195Q01JA PA041Q01TA PA042Q01TA
6606 6606 6606 6606 6606 6606
EFFORT1 EFFORT2 OCOD1 OCOD2 OCOD3 PROGN
1054 1106 0 0 0 0
AGE GRADE ISCEDP IMMIG COBN_S COBN_M
0 0 0 236 0 0
COBN_F LANGN REPEAT MISSSC SKIPPING TARDYSD
0 0 48 53 54 49
EXERPRAC STUDYHMW WORKPAY WORKHOME EXPECEDU MATHPREF
47 46 51 51 209 69
MATHEASE MATHMOT DURECEC BSMJ SISCO RELATST
75 67 2094 1379 259 48
BELONG BULLIED FEELSAFE SCHRISK PERSEVAGR CURIOAGR
49 46 43 51 92 103
COOPAGR EMPATAGR ASSERAGR STRESAGR EMOCOAGR GROSAGR
98 97 126 147 136 6606
INFOSEEK FAMSUP DISCLIM TEACHSUP COGACRCO COGACMCO
6606 105 63 68 71 84
EXPOFA EXPO21ST MATHEFF MATHEF21 FAMCON ANXMAT
80 83 77 80 90 82
MATHPERS CREATEFF CREATSCH CREATFAM CREATAS CREATOOS
83 6606 6606 6606 6606 6606
CREATOP OPENART IMAGINE SCHSUST LEARRES PROBSELF
6606 6606 6606 6606 6606 6606
FAMSUPSL FEELLAH SDLEFF MISCED FISCED HISCED
6606 6606 6606 74 97 57
PAREDINT BMMJ1 BFMJ2 HISEI ICTRES HOMEPOS
57 607 849 310 40 40
ESCS FCFMLRTY FLSCHOOL FLMULTSB FLFAMILY ACCESSFP
47 6606 6606 6606 6606 6606
FLCONFIN FLCONICT ACCESSFA ATTCONFM FRINFLFM ICTSCH
6606 6606 6606 6606 6606 131
ICTAVSCH ICTHOME ICTAVHOM ICTQUAL ICTSUBJ ICTENQ
119 138 131 139 186 155
ICTFEED ICTOUT ICTWKDY ICTWKEND ICTREG ICTINFO
163 157 173 204 189 193
ICTDISTR ICTEFFIC STUBMI BODYIMA SOCONPA LIFESAT
6606 258 6606 6606 6606 6606
PSYCHSYM SOCCON EXPWB CURSUPP PQMIMP PQMCAR
6606 6606 6606 6606 6606 6606
PARINVOL PQSCHOOL PASCHPOL ATTIMMP PAREXPT CREATHME
6606 6606 6606 6606 6606 6606
CREATACT CREATOPN CREATOR W_FSTUWT W_FSTURWT1 W_FSTURWT2
6606 6606 6606 0 0 0
W_FSTURWT3 W_FSTURWT4 W_FSTURWT5 W_FSTURWT6 W_FSTURWT7 W_FSTURWT8
0 0 0 0 0 0
W_FSTURWT9 W_FSTURWT10 W_FSTURWT11 W_FSTURWT12 W_FSTURWT13 W_FSTURWT14
0 0 0 0 0 0
W_FSTURWT15 W_FSTURWT16 W_FSTURWT17 W_FSTURWT18 W_FSTURWT19 W_FSTURWT20
0 0 0 0 0 0
W_FSTURWT21 W_FSTURWT22 W_FSTURWT23 W_FSTURWT24 W_FSTURWT25 W_FSTURWT26
0 0 0 0 0 0
W_FSTURWT27 W_FSTURWT28 W_FSTURWT29 W_FSTURWT30 W_FSTURWT31 W_FSTURWT32
0 0 0 0 0 0
W_FSTURWT33 W_FSTURWT34 W_FSTURWT35 W_FSTURWT36 W_FSTURWT37 W_FSTURWT38
0 0 0 0 0 0
W_FSTURWT39 W_FSTURWT40 W_FSTURWT41 W_FSTURWT42 W_FSTURWT43 W_FSTURWT44
0 0 0 0 0 0
W_FSTURWT45 W_FSTURWT46 W_FSTURWT47 W_FSTURWT48 W_FSTURWT49 W_FSTURWT50
0 0 0 0 0 0
W_FSTURWT51 W_FSTURWT52 W_FSTURWT53 W_FSTURWT54 W_FSTURWT55 W_FSTURWT56
0 0 0 0 0 0
W_FSTURWT57 W_FSTURWT58 W_FSTURWT59 W_FSTURWT60 W_FSTURWT61 W_FSTURWT62
0 0 0 0 0 0
W_FSTURWT63 W_FSTURWT64 W_FSTURWT65 W_FSTURWT66 W_FSTURWT67 W_FSTURWT68
0 0 0 0 0 0
W_FSTURWT69 W_FSTURWT70 W_FSTURWT71 W_FSTURWT72 W_FSTURWT73 W_FSTURWT74
0 0 0 0 0 0
W_FSTURWT75 W_FSTURWT76 W_FSTURWT77 W_FSTURWT78 W_FSTURWT79 W_FSTURWT80
0 0 0 0 0 0
UNIT WVARSTRR PV1MATH PV2MATH PV3MATH PV4MATH
0 0 0 0 0 0
PV5MATH PV6MATH PV7MATH PV8MATH PV9MATH PV10MATH
0 0 0 0 0 0
PV1READ PV2READ PV3READ PV4READ PV5READ PV6READ
0 0 0 0 0 0
PV7READ PV8READ PV9READ PV10READ PV1SCIE PV2SCIE
0 0 0 0 0 0
PV3SCIE PV4SCIE PV5SCIE PV6SCIE PV7SCIE PV8SCIE
0 0 0 0 0 0
PV9SCIE PV10SCIE PV1MCCR PV2MCCR PV3MCCR PV4MCCR
0 0 0 0 0 0
PV5MCCR PV6MCCR PV7MCCR PV8MCCR PV9MCCR PV10MCCR
0 0 0 0 0 0
PV1MCQN PV2MCQN PV3MCQN PV4MCQN PV5MCQN PV6MCQN
0 0 0 0 0 0
PV7MCQN PV8MCQN PV9MCQN PV10MCQN PV1MCSS PV2MCSS
0 0 0 0 0 0
PV3MCSS PV4MCSS PV5MCSS PV6MCSS PV7MCSS PV8MCSS
0 0 0 0 0 0
PV9MCSS PV10MCSS PV1MCUD PV2MCUD PV3MCUD PV4MCUD
0 0 0 0 0 0
PV5MCUD PV6MCUD PV7MCUD PV8MCUD PV9MCUD PV10MCUD
0 0 0 0 0 0
PV1MPEM PV2MPEM PV3MPEM PV4MPEM PV5MPEM PV6MPEM
0 0 0 0 0 0
PV7MPEM PV8MPEM PV9MPEM PV10MPEM PV1MPFS PV2MPFS
0 0 0 0 0 0
PV3MPFS PV4MPFS PV5MPFS PV6MPFS PV7MPFS PV8MPFS
0 0 0 0 0 0
PV9MPFS PV10MPFS PV1MPIN PV2MPIN PV3MPIN PV4MPIN
0 0 0 0 0 0
PV5MPIN PV6MPIN PV7MPIN PV8MPIN PV9MPIN PV10MPIN
0 0 0 0 0 0
PV1MPRE PV2MPRE PV3MPRE PV4MPRE PV5MPRE PV6MPRE
0 0 0 0 0 0
PV7MPRE PV8MPRE PV9MPRE PV10MPRE SENWT VER_DAT
0 0 0 0 0 0
i
0
While there are numerous missing values in the dataset, our focus is primarily on specific columns. We’re interested in the columns labeled:
CNTSCHID (International School ID)
CNTSTUID (International Student ID)
ST004D01T (Student Standardized Gender)
HISEI (Highest parental occupational status based on 4-digit human coded ISCO)
PAREDINT (Index highest parental education)
HOMEPOS (Home possessions)
ESCS (Index of economic, social and cultural status)
and a series of columns related to plausible values in different subjects. These subjects include:
Mathematics (PV1MATH to PV10MATH)
Reading (PV1READ to PV10READ)
Science (PV1SCIE to PV10SCIE)
We’ll examine these columns for missing data, as they are relevant to our analysis.
We can use the select function from the dplyr package to extract the specific columns.
library(dplyr)
# Selecting specific columns explicitly
selected_columns <- stu_qqq_Sg %>%
select(CNTSCHID, CNTSTUID, ST004D01T,
HISEI, PAREDINT, HOMEPOS, ESCS,
PV1MATH, PV2MATH, PV3MATH, PV4MATH, PV5MATH, PV6MATH, PV7MATH, PV8MATH, PV9MATH, PV10MATH,
PV1READ, PV2READ, PV3READ, PV4READ, PV5READ, PV6READ, PV7READ, PV8READ, PV9READ, PV10READ,
PV1SCIE, PV2SCIE, PV3SCIE, PV4SCIE, PV5SCIE, PV6SCIE, PV7SCIE, PV8SCIE, PV9SCIE, PV10SCIE)
# Count the number of missing values per selected column
na_per_selected_column <- colSums(is.na(selected_columns))
# Printing the number of missing values per selected column
print(na_per_selected_column) CNTSCHID CNTSTUID ST004D01T HISEI PAREDINT HOMEPOS ESCS PV1MATH
0 0 0 310 57 40 47 0
PV2MATH PV3MATH PV4MATH PV5MATH PV6MATH PV7MATH PV8MATH PV9MATH
0 0 0 0 0 0 0 0
PV10MATH PV1READ PV2READ PV3READ PV4READ PV5READ PV6READ PV7READ
0 0 0 0 0 0 0 0
PV8READ PV9READ PV10READ PV1SCIE PV2SCIE PV3SCIE PV4SCIE PV5SCIE
0 0 0 0 0 0 0 0
PV6SCIE PV7SCIE PV8SCIE PV9SCIE PV10SCIE
0 0 0 0 0
The counts of missing values for the variables in question are as follows:
Mother’s Occupational Status (BMMJ1): 607 missing values
Father’s Occupational Status (BFMJ2): 849 missing values
Highest Parental Occupational Status (HISEI): 310 missing values
However, we have opted not to impute or omit these cases as we want to preserve the original dataset, maintaining data integrity. Nevertheless, we acknowledge that the missing values in socioeconomic status indicators could limit the depth of socio-economic related insights and possibly affect the generalizability of our findings. This limitation is important to bear in mind when interpreting the results of our analysis. Future research may explore the reasons for these missing values and consider the application of appropriate data imputation methods where socio-economic context is of primary interest.
2. Data Wrangling
2.1 Calculate average plausible values for each student
library(dplyr)
# Assuming 'stu_qqq_Sg' is your data frame and 'CNTSTUID' is the column with International Student IDs
# Calculate the mean PV scores for each student in each subject
student_avg_scores <- stu_qqq_Sg %>%
mutate(
AvgMathPV = rowMeans(select(., starts_with("PV1MATH"):starts_with("PV10MATH")), na.rm = TRUE),
AvgReadPV = rowMeans(select(., starts_with("PV1READ"):starts_with("PV10READ")), na.rm = TRUE),
AvgSciePV = rowMeans(select(., starts_with("PV1SCIE"):starts_with("PV10SCIE")), na.rm = TRUE)
)
# The resulting 'student_avg_scores' data frame now has three new columns
# 'AvgMathPV', 'AvgReadPV', and 'AvgSciePV' which are the average plausible values
# for Mathematics, Reading, and Science respectively for each student.
# View the student level averages
print(student_avg_scores)# A tibble: 6,606 × 1,282
CNT CNTRYID CNTSCHID CNTSTUID CYC NatCen STRATUM SUBNATIO REGION OECD
<chr> <dbl> <fct> <fct> <chr> <chr> <chr> <chr> <dbl> <dbl>
1 SGP 702 70200052 70200001 08MS 070200 SGP01 7020000 70200 0
2 SGP 702 70200134 70200002 08MS 070200 SGP01 7020000 70200 0
3 SGP 702 70200112 70200003 08MS 070200 SGP01 7020000 70200 0
4 SGP 702 70200004 70200004 08MS 070200 SGP01 7020000 70200 0
5 SGP 702 70200152 70200005 08MS 070200 SGP01 7020000 70200 0
6 SGP 702 70200043 70200006 08MS 070200 SGP01 7020000 70200 0
7 SGP 702 70200049 70200007 08MS 070200 SGP01 7020000 70200 0
8 SGP 702 70200107 70200008 08MS 070200 SGP01 7020000 70200 0
9 SGP 702 70200012 70200009 08MS 070200 SGP01 7020000 70200 0
10 SGP 702 70200061 70200010 08MS 070200 SGP01 7020000 70200 0
# ℹ 6,596 more rows
# ℹ 1,272 more variables: ADMINMODE <dbl>, LANGTEST_QQQ <dbl>,
# LANGTEST_COG <dbl>, LANGTEST_PAQ <dbl>, Option_CT <dbl>, Option_FL <dbl>,
# Option_ICTQ <dbl>, Option_WBQ <dbl>, Option_PQ <dbl>, Option_TQ <dbl>,
# Option_UH <dbl>, BOOKID <dbl>, ST001D01T <dbl>, ST003D02T <dbl>,
# ST003D03T <dbl>, ST004D01T <fct>, ST250Q01JA <dbl>, ST250Q02JA <dbl>,
# ST250Q03JA <dbl>, ST250Q04JA <dbl>, ST250Q05JA <dbl>, ST250D06JA <chr>, …
2.2 Consolidate individual student performance data into school-wide metrics
We compute the average of plausible value (PV) scores for each subject at the school level. The steps to accomplish this in R include:
Aggregation of PV Scores per Subject
- Start by determining the average PV scores for every student by taking the mean across the ten plausible values for Mathematics, Reading, and Science respectively.
Consolidation by School
- Organize the dataset around the International School ID (CNTSCHID) to group students according to their school.
School-Level Averages
- For each group representing a school, compute the mean of these individual averages. This results in a single average score that represents the collective performance of students in each subject for that school.
library(dplyr)
# Calculate the mean PV scores for each student in each subject
stu_qqq_Sg <- stu_qqq_Sg %>%
mutate(AvgMathPV = rowMeans(select(., starts_with("PV1MATH"):starts_with("PV10MATH")), na.rm = TRUE),
AvgReadPV = rowMeans(select(., starts_with("PV1READ"):starts_with("PV10READ")), na.rm = TRUE),
AvgSciePV = rowMeans(select(., starts_with("PV1SCIE"):starts_with("PV10SCIE")), na.rm = TRUE))
# Group by school ID and calculate the mean of the student averages for each school
school_avg_scores <- stu_qqq_Sg %>%
group_by(CNTSCHID) %>%
summarise(AvgMathScore = mean(AvgMathPV, na.rm = TRUE),
AvgReadScore = mean(AvgReadPV, na.rm = TRUE),
AvgScieScore = mean(AvgSciePV, na.rm = TRUE))
# View the school level averages
print(school_avg_scores)# A tibble: 164 × 4
CNTSCHID AvgMathScore AvgReadScore AvgScieScore
<fct> <dbl> <dbl> <dbl>
1 70200001 725. 653. 698.
2 70200002 535. 507. 534.
3 70200003 740. 665. 714.
4 70200004 510. 480. 494.
5 70200005 547. 516. 529.
6 70200006 487. 468. 477.
7 70200007 580. 540. 579.
8 70200008 567. 522. 540.
9 70200009 560. 536. 552.
10 70200010 531. 503. 540.
# ℹ 154 more rows
2.3 Gender
library(dplyr)
# Assuming 'stu_qqq_Sg' is your data frame, 'CNTSTUID' is the column with International Student IDs,
# and 'ST004D01T' is the column with Student Standardized Gender (1 for female; 0 for male)
# Calculate the mean PV scores for each student in each subject and map gender numeric values to labels
gender_avg_scores <- stu_qqq_Sg %>%
mutate(
Gender = ifelse(ST004D01T == 1, "Female", "Male"),
AvgMathPV = rowMeans(select(., starts_with("PV1MATH"):starts_with("PV10MATH")), na.rm = TRUE),
AvgReadPV = rowMeans(select(., starts_with("PV1READ"):starts_with("PV10READ")), na.rm = TRUE),
AvgSciePV = rowMeans(select(., starts_with("PV1SCIE"):starts_with("PV10SCIE")), na.rm = TRUE)
)
# Now, 'gender_avg_scores' contains the individual student's data with their gender and average scores
print(gender_avg_scores)# A tibble: 6,606 × 1,283
CNT CNTRYID CNTSCHID CNTSTUID CYC NatCen STRATUM SUBNATIO REGION OECD
<chr> <dbl> <fct> <fct> <chr> <chr> <chr> <chr> <dbl> <dbl>
1 SGP 702 70200052 70200001 08MS 070200 SGP01 7020000 70200 0
2 SGP 702 70200134 70200002 08MS 070200 SGP01 7020000 70200 0
3 SGP 702 70200112 70200003 08MS 070200 SGP01 7020000 70200 0
4 SGP 702 70200004 70200004 08MS 070200 SGP01 7020000 70200 0
5 SGP 702 70200152 70200005 08MS 070200 SGP01 7020000 70200 0
6 SGP 702 70200043 70200006 08MS 070200 SGP01 7020000 70200 0
7 SGP 702 70200049 70200007 08MS 070200 SGP01 7020000 70200 0
8 SGP 702 70200107 70200008 08MS 070200 SGP01 7020000 70200 0
9 SGP 702 70200012 70200009 08MS 070200 SGP01 7020000 70200 0
10 SGP 702 70200061 70200010 08MS 070200 SGP01 7020000 70200 0
# ℹ 6,596 more rows
# ℹ 1,273 more variables: ADMINMODE <dbl>, LANGTEST_QQQ <dbl>,
# LANGTEST_COG <dbl>, LANGTEST_PAQ <dbl>, Option_CT <dbl>, Option_FL <dbl>,
# Option_ICTQ <dbl>, Option_WBQ <dbl>, Option_PQ <dbl>, Option_TQ <dbl>,
# Option_UH <dbl>, BOOKID <dbl>, ST001D01T <dbl>, ST003D02T <dbl>,
# ST003D03T <dbl>, ST004D01T <fct>, ST250Q01JA <dbl>, ST250Q02JA <dbl>,
# ST250Q03JA <dbl>, ST250Q04JA <dbl>, ST250Q05JA <dbl>, ST250D06JA <chr>, …
2.4 SES
# Assuming 'stu_qqq_Sg' is your original data frame
# Creating a data frame for HISEI
df_hisei <- stu_qqq_Sg %>%
filter(!is.na(HISEI)) %>%
mutate(
AvgMathPV = rowMeans(select(., starts_with("PV1MATH"), starts_with("PV10MATH")), na.rm = TRUE),
AvgReadPV = rowMeans(select(., starts_with("PV1READ"), starts_with("PV10READ")), na.rm = TRUE),
AvgSciePV = rowMeans(select(., starts_with("PV1SCIE"), starts_with("PV10SCIE")), na.rm = TRUE)
)
print(df_hisei)# A tibble: 6,296 × 1,282
CNT CNTRYID CNTSCHID CNTSTUID CYC NatCen STRATUM SUBNATIO REGION OECD
<chr> <dbl> <fct> <fct> <chr> <chr> <chr> <chr> <dbl> <dbl>
1 SGP 702 70200052 70200001 08MS 070200 SGP01 7020000 70200 0
2 SGP 702 70200134 70200002 08MS 070200 SGP01 7020000 70200 0
3 SGP 702 70200112 70200003 08MS 070200 SGP01 7020000 70200 0
4 SGP 702 70200004 70200004 08MS 070200 SGP01 7020000 70200 0
5 SGP 702 70200152 70200005 08MS 070200 SGP01 7020000 70200 0
6 SGP 702 70200043 70200006 08MS 070200 SGP01 7020000 70200 0
7 SGP 702 70200049 70200007 08MS 070200 SGP01 7020000 70200 0
8 SGP 702 70200107 70200008 08MS 070200 SGP01 7020000 70200 0
9 SGP 702 70200012 70200009 08MS 070200 SGP01 7020000 70200 0
10 SGP 702 70200061 70200010 08MS 070200 SGP01 7020000 70200 0
# ℹ 6,286 more rows
# ℹ 1,272 more variables: ADMINMODE <dbl>, LANGTEST_QQQ <dbl>,
# LANGTEST_COG <dbl>, LANGTEST_PAQ <dbl>, Option_CT <dbl>, Option_FL <dbl>,
# Option_ICTQ <dbl>, Option_WBQ <dbl>, Option_PQ <dbl>, Option_TQ <dbl>,
# Option_UH <dbl>, BOOKID <dbl>, ST001D01T <dbl>, ST003D02T <dbl>,
# ST003D03T <dbl>, ST004D01T <fct>, ST250Q01JA <dbl>, ST250Q02JA <dbl>,
# ST250Q03JA <dbl>, ST250Q04JA <dbl>, ST250Q05JA <dbl>, ST250D06JA <chr>, …
library(dplyr)
# Assuming 'stu_qqq_Sg' is your original data frame
# Creating a data frame for PAREDINT
df_paredint <- stu_qqq_Sg %>%
filter(!is.na(PAREDINT)) %>%
mutate(
AvgMathPV = rowMeans(select(., starts_with("PV1MATH"), starts_with("PV10MATH")), na.rm = TRUE),
AvgReadPV = rowMeans(select(., starts_with("PV1READ"), starts_with("PV10READ")), na.rm = TRUE),
AvgSciePV = rowMeans(select(., starts_with("PV1SCIE"), starts_with("PV10SCIE")), na.rm = TRUE)
)
# 'df_paredint' now contains students without missing PAREDINT values and their average PV scoreslibrary(dplyr)
# Assuming 'stu_qqq_Sg' is your original data frame
# Creating a data frame for HOMEPOS
df_homepos <- stu_qqq_Sg %>%
filter(!is.na(HOMEPOS)) %>%
mutate(
AvgMathPV = rowMeans(select(., starts_with("PV1MATH"), starts_with("PV10MATH")), na.rm = TRUE),
AvgReadPV = rowMeans(select(., starts_with("PV1READ"), starts_with("PV10READ")), na.rm = TRUE),
AvgSciePV = rowMeans(select(., starts_with("PV1SCIE"), starts_with("PV10SCIE")), na.rm = TRUE)
)
# 'df_homepos' now contains students without missing HOMEPOS values and their average PV scoreslibrary(dplyr)
# Assuming 'stu_qqq_Sg' is your original data frame
# Creating a data frame for ESCS
df_escs <- stu_qqq_Sg %>%
filter(!is.na(ESCS)) %>%
mutate(
AvgMathPV = rowMeans(select(., starts_with("PV1MATH"), starts_with("PV10MATH")), na.rm = TRUE),
AvgReadPV = rowMeans(select(., starts_with("PV1READ"), starts_with("PV10READ")), na.rm = TRUE),
AvgSciePV = rowMeans(select(., starts_with("PV1SCIE"), starts_with("PV10SCIE")), na.rm = TRUE)
)
# 'df_escs' now contains students without missing ESCS values and their average PV scores3. Results
3.1 Average Plausible Values by Students in Singapore
library(plotly)
library(dplyr)
# Assuming 'student_avg_scores' already contains the average plausible values per student
# Create a histogram for each subject using plotly
fig <- plot_ly(student_avg_scores, histnorm = "percent") %>%
add_trace(
x = ~AvgMathPV,
type = 'histogram',
opacity = 0.6,
name = 'Mathematics',
marker = list(color = 'blue')
) %>%
add_trace(
x = ~AvgReadPV,
type = 'histogram',
opacity = 0.6,
name = 'Reading',
marker = list(color = 'orange')
) %>%
add_trace(
x = ~AvgSciePV,
type = 'histogram',
opacity = 0.6,
name = 'Science',
marker = list(color = 'green')
) %>%
layout(
barmode = 'overlay', # Use 'overlay' to place the histograms on top of each other
title = 'Distribution of Average Plausible Values for Each Subject',
xaxis = list(title = 'Average Plausible Values'),
yaxis = list(title = 'Percentage of Students')
)
# Print the plot
fig3.2 Average Plausible Values by School
library(plotly)
library(dplyr)
# Rank schools by AvgMathScore, AvgReadScore, and AvgScieScore and add labels for top and bottom schools
school_avg_scores <- school_avg_scores %>%
mutate(
RankMath = rank(-AvgMathScore),
RankRead = rank(-AvgReadScore),
RankScie = rank(-AvgScieScore),
LabelMath = ifelse(RankMath == 1 | RankMath == n(), as.character(CNTSCHID), NA),
LabelRead = ifelse(RankRead == 1 | RankRead == n(), as.character(CNTSCHID), NA),
LabelScie = ifelse(RankScie == 1 | RankScie == n(), as.character(CNTSCHID), NA)
)
# Find the maximum score to set the limits of the y-axis
max_score <- max(school_avg_scores$AvgMathScore, school_avg_scores$AvgReadScore, school_avg_scores$AvgScieScore)
# Create a scatter plot for each subject using plotly
fig <- plot_ly() %>%
add_trace(data = school_avg_scores, x = ~RankMath, y = ~AvgMathScore, name = 'Mathematics',
type = 'scatter', mode = 'markers', marker = list(size = 3),
hoverinfo = 'text', text = ~CNTSCHID, textposition = 'top center') %>%
add_trace(data = school_avg_scores, x = ~RankRead, y = ~AvgReadScore, name = 'Reading',
type = 'scatter', mode = 'markers', marker = list(size = 3),
hoverinfo = 'text', text = ~CNTSCHID, textposition = 'top center') %>%
add_trace(data = school_avg_scores, x = ~RankScie, y = ~AvgScieScore, name = 'Science',
type = 'scatter', mode = 'markers', marker = list(size = 3),
hoverinfo = 'text', text = ~CNTSCHID, textposition = 'top center') %>%
layout(title = 'Average Plausible Values by School',
xaxis = list(title = 'International School ID', showticklabels = FALSE, zeroline = FALSE),
yaxis = list(title = 'Average Plausible Values', range = c(0, max_score * 1.1)),
hoverlabel = list(bgcolor = "white"))
# Add labels for the top and bottom school for each subject
fig <- fig %>%
add_trace(data = subset(school_avg_scores, !is.na(LabelMath)), x = ~RankMath, y = ~AvgMathScore,
type = 'scatter', mode = 'text', text = ~LabelMath, showlegend = FALSE) %>%
add_trace(data = subset(school_avg_scores, !is.na(LabelRead)), x = ~RankRead, y = ~AvgReadScore,
type = 'scatter', mode = 'text', text = ~LabelRead, showlegend = FALSE) %>%
add_trace(data = subset(school_avg_scores, !is.na(LabelScie)), x = ~RankScie, y = ~AvgScieScore,
type = 'scatter', mode = 'text', text = ~LabelScie, showlegend = FALSE)
# Print the plot
fig3.3 Gender
library(dplyr)
library(tidyr)
library(ggplot2)
# Assuming 'stu_qqq_Sg' is your data frame
# Prepare the data with gender and average scores
gender_scores <- stu_qqq_Sg %>%
mutate(
Gender = ifelse(ST004D01T == 1, "Female", "Male"),
AvgMathPV = rowMeans(select(., starts_with("PV1MATH"):starts_with("PV10MATH")), na.rm = TRUE),
AvgReadPV = rowMeans(select(., starts_with("PV1READ"):starts_with("PV10READ")), na.rm = TRUE),
AvgSciePV = rowMeans(select(., starts_with("PV1SCIE"):starts_with("PV10SCIE")), na.rm = TRUE)
)
# Reshape the data to long format
long_format <- gender_scores %>%
select(CNTSTUID, Gender, AvgMathPV, AvgReadPV, AvgSciePV) %>%
pivot_longer(
cols = c(AvgMathPV, AvgReadPV, AvgSciePV),
names_to = "Subject",
values_to = "Score"
)
# Adjust the Subject names if necessary
long_format$Subject <- recode(long_format$Subject,
AvgMathPV = "Math",
AvgReadPV = "Reading",
AvgSciePV = "Science")
# Create the boxplot using the default ggplot2 color scheme
boxplot <- ggplot(long_format, aes(x = Gender, y = Score, fill = Gender)) +
geom_boxplot() +
facet_wrap(~Subject, strip.position = "bottom") + # Move the facet labels to the bottom
labs(
title = "Score Distribution by Gender and Subject",
x = "", # Remove the x-axis label
y = "Average Plausible Values" # Change the y-axis label to 'Average Plausible Values'
) +
theme(
axis.title.x = element_blank(), # Remove the x-axis title
axis.text.x = element_blank(), # Remove the x-axis text
axis.ticks.x = element_blank(), # Remove the x-axis ticks
strip.background = element_blank(), # Remove facet label background
strip.text.x = element_text(size = 12), # Adjust facet label text size
plot.title = element_text(hjust = 0.5) # Center the title
)
# Print the boxplot
print(boxplot)
3.4 SES part 1
library(ggplot2)
library(tidyr)
library(dplyr)
# Reshaping the data
long_df_hisei <- df_hisei %>%
pivot_longer(
cols = c("AvgMathPV", "AvgReadPV", "AvgSciePV"),
names_to = "Subject",
values_to = "AveragePV"
) %>%
mutate(Subject = recode(Subject,
"AvgMathPV" = "Math",
"AvgReadPV" = "Reading",
"AvgSciePV" = "Science"))
# Creating a scatter plot
fig <- plot_ly(long_df_hisei, x = ~HISEI, y = ~AveragePV, type = 'scatter', mode = 'markers',
color = ~Subject, colors = RColorBrewer::brewer.pal(3, "Dark2"),
marker = list(size = 5, opacity = 0.5),
hoverinfo = 'text',
text = ~paste("Subject:", Subject, "<br>HISEI:", HISEI, "<br>Average PV:", AveragePV))
# Adding trend lines for each subject
subjects <- unique(long_df_hisei$Subject)
for (subject in subjects) {
subject_data <- long_df_hisei[long_df_hisei$Subject == subject,]
lm_model <- lm(AveragePV ~ HISEI, data = subject_data)
fig <- fig %>%
add_trace(data = subject_data, x = ~HISEI, y = ~fitted(lm_model), type = 'scatter', mode = 'lines',
line = list(color = RColorBrewer::brewer.pal(3, "Dark2")[which(subjects == subject)], width = 1),
showlegend = FALSE)
}
# Layout adjustments
fig <- fig %>%
layout(title = 'Relationship between HISEI and Average PV Scores',
xaxis = list(title = 'HISEI', zeroline = FALSE, range = c(0, max(long_df_hisei$HISEI, na.rm = TRUE))),
yaxis = list(title = 'Average PV Score', zeroline = FALSE, range = c(0, max(long_df_hisei$AveragePV, na.rm = TRUE))),
legend = list(orientation = 'v', x = 1.05, y = 0.5, xanchor = 'left', yanchor = 'middle'))
# Print the interactive plot
fig